Geographical Analysis of media
7. Textmodels
Objectives
The aim of this section is to test basic methods of textual analysis on the vocabulary associated with the different macroregions. The question is how these models can be applied to the different languages of our corpora and what issues stand in the way of a genuine comparison. We will focus on the difference in vocabulary between the two macroregional entities “EU” and “Europe”, but we also try some experiments with “NATO” or “Mediterranean”.
Preparation
Load quanteda annotated corpus
We load the quanteda annotated corpus and split it by media
# Read the annotated quanteda corpus from disk
qd <- readRDS("corpus/qd_mycorpus_geo_top.RDS")

# One subcorpus per media outlet, selected on the `who` document variable
qd_fr <- corpus_subset(qd, who == "FRA_figaro")
qd_de <- corpus_subset(qd, who == "DEU_suddeu")
qd_tr <- corpus_subset(qd, who == "TUR_dunya")
qd_tn <- corpus_subset(qd, who == "TUN_afrman")

# Tidy version of the full corpus
td <- tidy(qd)

Extract subcorpora of international news
We extract the corpus and the subcorpora of news in which at least one state or one region is mentioned.
# Keep only the documents with at least one geographic mention (nbgeo > 0)
qd_int <- corpus_subset(qd, nbgeo > 0)

# Split the international corpus by media outlet
qd_int_fr <- corpus_subset(qd_int, who == "FRA_figaro")
qd_int_de <- corpus_subset(qd_int, who == "DEU_suddeu")
qd_int_tr <- corpus_subset(qd_int, who == "TUR_dunya")
qd_int_tn <- corpus_subset(qd_int, who == "TUN_afrman")

Extract subcorpora of macroregional news
We extract the corpus and the subcorpora of news in which at least one macroregion is mentioned.
# Keep only the documents with at least one macroregion mention (nbregions > 0)
qd_reg <- corpus_subset(qd, nbregions > 0)

# Split the macroregional corpus by media outlet
qd_reg_fr <- corpus_subset(qd_reg, who == "FRA_figaro")
qd_reg_de <- corpus_subset(qd_reg, who == "DEU_suddeu")
qd_reg_tr <- corpus_subset(qd_reg, who == "TUR_dunya")
qd_reg_tn <- corpus_subset(qd_reg, who == "TUN_afrman")

ANALYSIS 1 : WORDCLOUD OF MACROREGIONAL VOCABULARY
We first consider the vocabulary of the news that our dictionary associates with a macroregion, in order to discover whether we can define a “macroregional vocabulary”.
Tunisia / African Manager
# Word cloud of the Tunisian macroregional subcorpus
sel <- qd_reg_tn

# Replace apostrophes and hyphens by spaces before tokenising
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
# Map both spellings of the EU's full name to its acronym
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
# Blank out the name of the newspaper's own country
sel <- gsub("Tunisie", " ", sel)

stopw <- stopwords("fr", source = "stopwords-iso")

# Tokenise explicitly: calling dfm() on a corpus with `remove =` is deprecated
# in quanteda 3 and the other analyses of this document already use this form.
dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Fix the random seed before drawing the cloud
set.seed(100)
textplot_wordcloud(
  dfm,
  max_words = 100, min_size = 1, max_size = 10, rotation = 0
)

France / Le Figaro
# Word cloud of the French macroregional subcorpus
sel <- qd_reg_fr

# Replace apostrophes and hyphens by spaces before tokenising
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
# Map both spellings of the EU's full name to its acronym
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
# Blank out the name of the newspaper's own country
sel <- gsub("France", " ", sel)

stopw <- stopwords("fr", source = "stopwords-iso")

# Tokenise explicitly: calling dfm() on a corpus with `remove =` is deprecated
# in quanteda 3 and the other analyses of this document already use this form.
dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Fix the random seed before drawing the cloud
set.seed(100)
textplot_wordcloud(
  dfm,
  max_words = 100, min_size = 1, max_size = 12, rotation = 0
)

Germany / Süddeutsche Zeitung
# Word cloud of the German macroregional subcorpus
sel <- qd_reg_de

# Replace apostrophes and hyphens by spaces before tokenising
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
# Merge the genitive form with the base form
sel <- gsub("Europas", "Europa", sel)
# Use the German acronym "EU" (as the period analysis of the German corpus
# does) rather than the French "UE", so that the full name and the acronym
# end up as one single token.
sel <- gsub("Europäische Union", "EU", sel)
# Blank out the name of the newspaper's own country
sel <- gsub("Deutschland", " ", sel)

stopw <- stopwords("de", source = "stopwords-iso")

# Tokenise explicitly: calling dfm() on a corpus with `remove =` is deprecated
# in quanteda 3 and the other analyses of this document already use this form.
dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Fix the random seed before drawing the cloud
set.seed(100)
textplot_wordcloud(
  dfm,
  max_words = 100, min_size = 1, max_size = 12, rotation = 0
)

Turkey / Dunya
# Word cloud of the Turkish macroregional subcorpus
sel <- qd_reg_tr

# Replace apostrophes and hyphens by spaces before tokenising
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
# Map the EU's full Turkish name to its acronym
sel <- gsub("Avrupa Birliği", "AB", sel)
# Blank out the name of the newspaper's own country
sel <- gsub("Türkiye", " ", sel)

stopw <- stopwords("tr", source = "stopwords-iso")

# Tokenise explicitly: calling dfm() on a corpus with `remove =` is deprecated
# in quanteda 3 and the other analyses of this document already use this form.
dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Fix the random seed before drawing the cloud
set.seed(100)
textplot_wordcloud(
  dfm,
  max_words = 100, min_size = 1, max_size = 10, rotation = 0
)
td <- tidy(sel)

ANALYSIS 2.1 : SPECIFIC VOCABULARY OF A MACROREGION / AFRICA
We take the example of “Africa” and examine what are the specific terms associated to this macroregion in the different corpora and subcorpora. We decide to use the “international” corpus as reference.
Tunisia
# Reference corpus: Tunisian news with at least one geographic mention
sel <- qd_int_tn

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and two multiword names are joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("Moyen Orient", "Moyen_Orient", sel)
sel <- gsub("Wall Street", "Wall_Street", sel)

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "CO_AFR")
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 16051 499
# Build a dfm with one row per value of the flag (FALSE / TRUE)
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("Afrique", "2ème", "3e", "Orange")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = TRUE
)
td <- tidy(sel)

France
# Reference corpus: French news with at least one geographic mention
sel <- qd_int_fr

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and two multiword names are joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("Moyen Orient", "Moyen_Orient", sel)
sel <- gsub("Wall Street", "Wall_Street", sel)

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "CO_AFR")
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 81732 364
# Build a dfm with one row per value of the flag (FALSE / TRUE)
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("Afrique", "2ème", "3e", "19")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = TRUE
)
td <- tidy(sel)

Germany
# Reference corpus: German news with at least one geographic mention
sel <- qd_int_de

# Text normalisation: apostrophes and hyphens become spaces and the genitive
# "Europas" is merged with "Europa"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Europas", "Europa", sel)
# Use the German acronym "EU" (as the period analysis of the German corpus
# does) instead of the French "UE", so the full name merges with the acronym
sel <- gsub("Europäische Union", "EU", sel)
stopw <- stopwords("de", source = "stopwords-iso")

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "CO_AFR")
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 24005 181
# Build a dfm with one row per value of the flag (FALSE / TRUE)
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(c("Afrika")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = TRUE
)
td <- tidy(sel)

Turkey
# Reference corpus: Turkish news with at least one geographic mention.
# The original code overwrote this selection with qd_reg_tr on the next line,
# which contradicts the stated method (international corpus as reference) and
# the three other country blocks; the override has been removed.
sel <- qd_int_tr

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and multiword names are shortened or joined
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Avrupa Birliği", "AB", sel)
sel <- gsub("Güney Afrika", "Güney_Afrika", sel)
sel <- gsub("Recep Tayyip Erdoğan", "Erdoğan", sel)
stopw <- stopwords("tr", source = "stopwords-iso")

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "CO_AFR")
# NOTE(review): the counts recorded below were produced while `sel` was still
# overwritten with qd_reg_tr; re-run to refresh them.
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 6102 299
# Build a dfm with one row per value of the flag (FALSE / TRUE)
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(c("Afrika", "Güney_Afrika")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = TRUE
)
td <- tidy(sel)

ANALYSIS 2.2 : SPECIFIC VOCABULARY OF A MACROREGION / MEDITERRANEAN
We take now the example of “Mediterranean” and examine what are the specific terms associated to this macroregion in the different corpora and subcorpora. We decide to use the “international” corpus as reference.
Tunisia
# Reference corpus: Tunisian news with at least one geographic mention
sel <- qd_int_tn

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and two multiword names are joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("Moyen Orient", "Moyen_Orient", sel)
sel <- gsub("Wall Street", "Wall_Street", sel)

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "SE_medit")
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 16498 52
# Build a dfm with one row per value of the flag (FALSE / TRUE)
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("Méditerranée", "2ème", "3e", "Orange", "5")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for the target group only
textplot_keyness(
  result_keyness,
  min_count = 20, n = 25, show_reference = FALSE
)
td <- tidy(sel)

France
# Reference corpus: French news with at least one geographic mention
sel <- qd_int_fr

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and two multiword names are joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("SOS Méditerranée", "SOS_Méditerranée", sel)
sel <- gsub("Ocean Viking", "Ocean_Viking", sel)

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "SE_medit")
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 81809 287
# Build a dfm with one row per value of the flag (FALSE / TRUE)
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("Méditerranée", "2ème", "3e", "19")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for the target group only
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = FALSE
)
td <- tidy(sel)

Germany
# Reference corpus: German news with at least one geographic mention
sel <- qd_int_de

# Text normalisation: apostrophes and hyphens become spaces and the genitive
# "Europas" is merged with "Europa"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Europas", "Europa", sel)
# Use the German acronym "EU" (as the period analysis of the German corpus
# does) instead of the French "UE", so the full name merges with the acronym
sel <- gsub("Europäische Union", "EU", sel)
stopw <- stopwords("de", source = "stopwords-iso")

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "SE_medit")
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 24094 92
# Build a dfm with one row per value of the flag (FALSE / TRUE)
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(c("Mittelmeer", "3", "100")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for the target group only
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = FALSE
)
td <- tidy(sel)

Turkey
# Reference corpus: Turkish news with at least one geographic mention.
# The original code overwrote this selection with qd_reg_tr on the next line,
# which contradicts the stated method (international corpus as reference) and
# the three other country blocks; the override has been removed.
sel <- qd_int_tr

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and multiword names are shortened or joined
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Avrupa Birliği", "AB", sel)
sel <- gsub("Güney Afrika", "Güney_Afrika", sel)
sel <- gsub("Recep Tayyip Erdoğan", "Erdoğan", sel)
stopw <- stopwords("tr", source = "stopwords-iso")

# Flag the documents whose `geo` field contains the macroregion code
sel$ref <- str_detect(sel$geo, "SE_medit")
# NOTE(review): the counts recorded below were produced while `sel` was still
# overwritten with qd_reg_tr; re-run to refresh them.
table(sel$ref)
FALSE
FALSE FALSE TRUE
FALSE 5869 532
# Build a dfm with one row per value of the flag (FALSE / TRUE).
# NOTE(review): this is the only keyness block that lowercases the tokens
# (tolower = TRUE); confirm that this is intended.
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(c("Akdeniz")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = TRUE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness of each term, with the flagged documents ("TRUE") as target group
result_keyness <- textstat_keyness(pres_dfm, target = "TRUE")

# Plot the terms with the highest keyness for the target group only
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = FALSE
)
td <- tidy(sel)

ANALYSIS 3.1 : SPECIFIC VOCABULARY OF TWO REGIONS : EU / Europe
We now take the example of the two regions EU and Europe for a benchmarking of their respective vocabulary. We keep the international corpus as reference.
Tunisia
# Reference corpus: Tunisian news with at least one geographic mention
sel <- qd_int_tn

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and two multiword names are joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("Moyen Orient", "Moyen_Orient", sel)
sel <- gsub("Wall Street", "Wall_Street", sel)

# Keep the news tagged with exactly one of the two macroregions
sel$ref1 <- str_detect(sel$geo, "OR_EU")
sel$ref2 <- str_detect(sel$geo, "CO_EUR")
sel <- corpus_subset(sel, ref1 != ref2)
# Explicit levels/labels: FALSE (Europe only) -> "Europe", TRUE -> "EU".
# Unlike relabelling with `levels<-`, this also works if one group is empty.
sel$ref <- factor(
  sel$ref1,
  levels = c(FALSE, TRUE), labels = c("Europe", "EU")
)

# Build a dfm with one row per macroregion ("Europe" / "EU")
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("Europe", "UE", "19", "2019")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Keyness with "EU" as target group. The argument is `target`: the original
# `sel = "EU"` is not a parameter of textstat_keyness(), so the target fell
# back to the default (first group, i.e. "Europe").
result_keyness <- textstat_keyness(pres_dfm, target = "EU")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 10, n = 20, show_reference = TRUE
)
td <- tidy(sel)

France
# Reference corpus: French news with at least one geographic mention
sel <- qd_int_fr

# Text normalisation: apostrophes and hyphens become spaces, multiword names
# are replaced by an acronym, shortened, or joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("Etats Unis", "USA", sel)
sel <- gsub("États Unis", "USA", sel)
sel <- gsub("SOS Méditerranée", "SOS_Méditerranée", sel)
sel <- gsub("Ocean Viking", "Ocean_Viking", sel)
sel <- gsub("Emmanuel Macron", "Macron", sel)

# Keep the news tagged with exactly one of the two macroregions
sel$ref1 <- str_detect(sel$geo, "OR_EU")
sel$ref2 <- str_detect(sel$geo, "CO_EUR")
sel <- corpus_subset(sel, ref1 != ref2)
# Explicit levels/labels: FALSE (Europe only) -> "Europe", TRUE -> "EU".
# Unlike relabelling with `levels<-`, this also works if one group is empty.
sel$ref <- factor(
  sel$ref1,
  levels = c(FALSE, TRUE), labels = c("Europe", "EU")
)

# Build a dfm with one row per macroregion ("Europe" / "EU")
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("Europe", "UE", "19", "2019", "27", "°")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness with "EU" as target group. The argument is `target`: the original
# `sel = "EU"` is not a parameter of textstat_keyness(), so the target fell
# back to the default (first group, i.e. "Europe").
result_keyness <- textstat_keyness(pres_dfm, target = "EU")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = TRUE
)
td <- tidy(sel)

Germany
# Reference corpus: German news with at least one geographic mention
sel <- qd_int_de

# Text normalisation: apostrophes and hyphens become spaces and the genitive
# "Europas" is merged with "Europa"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Europas", "Europa", sel)
# Map the full name to "EU", not the French "UE": the token list removed
# below contains "EU", so "UE" tokens would have stayed in the dfm.
sel <- gsub("Europäische Union", "EU", sel)
stopw <- stopwords("de", source = "stopwords-iso")

# Keep the news tagged with exactly one of the two macroregions
sel$ref1 <- str_detect(sel$geo, "OR_EU")
sel$ref2 <- str_detect(sel$geo, "CO_EUR")
sel <- corpus_subset(sel, ref1 != ref2)
# Explicit levels/labels: FALSE (Europe only) -> "Europe", TRUE -> "EU".
# Unlike relabelling with `levels<-`, this also works if one group is empty.
sel$ref <- factor(
  sel$ref1,
  levels = c(FALSE, TRUE), labels = c("Europe", "EU")
)

# Build a dfm with one row per macroregion ("Europe" / "EU")
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(c("Europa", "EU", "3", "100")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness with "Europe" as target group. The argument is `target`; the
# original `sel = "Europe"` is not a parameter of textstat_keyness().
# NOTE(review): the Tunisia and France blocks of this analysis name "EU" as
# the target; confirm which orientation is wanted for the comparison.
result_keyness <- textstat_keyness(pres_dfm, target = "Europe")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = TRUE
)
td <- tidy(sel)

Turkey
# Reference corpus: Turkish news with at least one geographic mention.
# The original code overwrote this selection with qd_reg_tr on the next line,
# which contradicts the stated method (international corpus as reference) and
# the three other country blocks; the override has been removed.
sel <- qd_int_tr

# Text normalisation: apostrophes and hyphens become spaces, multiword names
# are replaced by an acronym, shortened, or joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Avrupa Birliği", "AB", sel)
sel <- gsub("Güney Afrika", "Güney_Afrika", sel)
sel <- gsub("Recep Tayyip Erdoğan", "Erdoğan", sel)
sel <- gsub("Christine Lagarde", "Christine_Lagarde", sel)
sel <- gsub("Avrupa Merkez Bankası", "ECB", sel)
sel <- gsub("Avrupa Merkez Bankasi", "ECB", sel)
sel <- gsub("Avrupa İmar ve Kalkınma Bankası", "EBRD", sel)
stopw <- stopwords("tr", source = "stopwords-iso")

# Keep the news tagged with exactly one of the two macroregions
sel$ref1 <- str_detect(sel$geo, "OR_EU")
sel$ref2 <- str_detect(sel$geo, "CO_EUR")
sel <- corpus_subset(sel, ref1 != ref2)
# Explicit levels/labels: FALSE (Europe only) -> "Europe", TRUE -> "EU".
# Unlike relabelling with `levels<-`, this also works if one group is empty.
sel$ref <- factor(
  sel$ref1,
  levels = c(FALSE, TRUE), labels = c("Europe", "EU")
)

# Build a dfm with one row per macroregion ("Europe" / "EU")
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(c("Avrupa", "AB", "den", "nın", "nin", "ye")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 20, verbose = FALSE)

# Keyness with "Europe" as target group. The argument is `target`; the
# original `sel = "Europe"` is not a parameter of textstat_keyness().
# NOTE(review): the Tunisia and France blocks of this analysis name "EU" as
# the target; confirm which orientation is wanted for the comparison.
result_keyness <- textstat_keyness(pres_dfm, target = "Europe")

# Plot the terms with the highest keyness for target and reference groups
textplot_keyness(
  result_keyness,
  min_count = 20, n = 20, show_reference = TRUE
)
td <- tidy(sel)

ANALYSIS 3.2 : SPECIFIC VOCABULARY OF ONE REGION BY PERIOD : EU in 2018-19 & 2022_2023
We now take the example of a single region, the EU, and compare the vocabulary associated with it in two periods, 2018-19 and 2022-23. We keep the international corpus as reference.
Tunisia
# Reference corpus: Tunisian news with at least one geographic mention
sel <- qd_int_tn

# Text normalisation: apostrophes and hyphens become spaces, the EU's full
# name becomes its acronym, and multiword expressions are joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("Moyen Orient", "Moyen_Orient", sel)
sel <- gsub("Wall Street", "Wall_Street", sel)
sel <- gsub("liste noire", "liste_noire", sel)

# Keep the news tagged with the EU and assign each one to a two-year period
sel <- corpus_subset(sel, str_detect(sel$geo, "OR_EU"))
period_breaks <- as.Date(
  c("2018-01-01", "2020-01-01", "2022-01-01", "2024-01-01")
)
sel$ref <- cut(
  sel$day,
  breaks = period_breaks,
  labels = c("2018-19", "2020-21", "2022-23")
)
# Drop the middle period, then the factor level that is now unused
sel <- corpus_subset(sel, ref != "2020-21")
sel$ref <- droplevels(sel$ref)
table(sel$ref)
FALSE
FALSE 2018-19 2022-23
FALSE 176 125
# Build a dfm with one row per period
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("Europe", "UE", "19", "2019", "150")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 5, verbose = FALSE)

# Keyness with 2018-19 as target period. The argument is `target`; the
# original `sel = "2018-19"` is not a parameter of textstat_keyness().
result_keyness <- textstat_keyness(pres_dfm, target = "2018-19")

# Plot the terms with the highest keyness for target and reference periods
textplot_keyness(
  result_keyness,
  min_count = 5, n = 20, show_reference = TRUE
)
td <- tidy(sel)

France
# Reference corpus: French news with at least one geographic mention
sel <- qd_int_fr

# Text normalisation: apostrophes and hyphens become spaces, multiword names
# are replaced by an acronym, shortened, or joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Union Européenne", "UE", sel)
sel <- gsub("Union européenne", "UE", sel)
sel <- gsub("Etats Unis", "USA", sel)
sel <- gsub("États Unis", "USA", sel)
sel <- gsub("SOS Méditerranée", "SOS_Méditerranée", sel)
sel <- gsub("Ocean Viking", "Ocean_Viking", sel)
sel <- gsub("Emmanuel Macron", "Macron", sel)
sel <- gsub("Royaume Uni", "Royaume_Uni", sel)
sel <- gsub("von der Leyen", "VDLeyen", sel)
sel <- gsub("Von der Leyen", "VDLeyen", sel)
sel <- gsub("Theresa May", "T_May", sel)

# Keep the news tagged with the EU and assign each one to a two-year period
sel <- corpus_subset(sel, str_detect(sel$geo, "OR_EU"))
period_breaks <- as.Date(
  c("2018-01-01", "2020-01-01", "2022-01-01", "2024-01-01")
)
sel$ref <- cut(
  sel$day,
  breaks = period_breaks,
  labels = c("2018-19", "2020-21", "2022-23")
)
# Drop the middle period, then the factor level that is now unused
sel <- corpus_subset(sel, ref != "2020-21")
sel$ref <- droplevels(sel$ref)
table(sel$ref)
FALSE
FALSE 2018-19 2022-23
FALSE 901 334
# Build a dfm with one row per period
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopwords("fr", source = "stopwords-iso")) %>%
  tokens_remove(c("UE", "19", "2019", "27", "°", "ans", "2023")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Keyness with 2018-19 as target period. The argument is `target`; the
# original `sel = "2018-19"` is not a parameter of textstat_keyness().
result_keyness <- textstat_keyness(pres_dfm, target = "2018-19")

# Plot the terms with the highest keyness for target and reference periods
textplot_keyness(
  result_keyness,
  min_count = 10, n = 20, show_reference = TRUE
)
td <- tidy(sel)

Germany
# Reference corpus: German news with at least one geographic mention
sel <- qd_int_de

# Text normalisation: apostrophes and hyphens become spaces, the genitive
# "Europas" is merged with "Europa", and the EU's full name becomes "EU"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Europas", "Europa", sel)
sel <- gsub("Europäische Union", "EU", sel)
stopw <- stopwords("de", source = "stopwords-iso")

# Keep the news tagged with the EU and assign each one to a two-year period
sel <- corpus_subset(sel, str_detect(sel$geo, "OR_EU"))
period_breaks <- as.Date(
  c("2018-01-01", "2020-01-01", "2022-01-01", "2024-01-01")
)
sel$ref <- cut(
  sel$day,
  breaks = period_breaks,
  labels = c("2018-19", "2020-21", "2022-23")
)
# Drop the middle period, then the factor level that is now unused
sel <- corpus_subset(sel, ref != "2020-21")
sel$ref <- droplevels(sel$ref)
table(sel$ref)
FALSE
FALSE 2018-19 2022-23
FALSE 781 264
# Build a dfm with one row per period
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(c("EU", "3", "100")) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Keyness with 2018-19 as target period. The argument is `target`; the
# original `sel = "2018-19"` is not a parameter of textstat_keyness().
result_keyness <- textstat_keyness(pres_dfm, target = "2018-19")

# Plot the terms with the highest keyness for target and reference periods
textplot_keyness(
  result_keyness,
  min_count = 10, n = 20, show_reference = TRUE
)
td <- tidy(sel)

Turkey
# Reference corpus: Turkish news with at least one geographic mention.
# The original code overwrote this selection with qd_reg_tr on the next line,
# which contradicts the stated method (international corpus as reference) and
# the three other country blocks; the override has been removed.
sel <- qd_int_tr

# Text normalisation: apostrophes and hyphens become spaces, multiword names
# are replaced by an acronym, shortened, or joined with "_"
sel <- gsub("'", " ", sel)
sel <- gsub("’", " ", sel)
sel <- gsub("-", " ", sel)
sel <- gsub("Avrupa Birliği", "AB", sel)
sel <- gsub("Güney Afrika", "Güney_Afrika", sel)
sel <- gsub("Recep Tayyip Erdoğan", "Erdoğan", sel)
sel <- gsub("Christine Lagarde", "Christine_Lagarde", sel)
sel <- gsub("Avrupa Merkez Bankası", "ECB", sel)
sel <- gsub("Avrupa Merkez Bankasi", "ECB", sel)
sel <- gsub("Avrupa İmar ve Kalkınma Bankası", "EBRD", sel)
sel <- gsub("Ursula von der Leyen", "VDLeyen", sel)
stopw <- stopwords("tr", source = "stopwords-iso")

# Keep the news tagged with the EU and assign each one to a two-year period
sel <- corpus_subset(sel, str_detect(sel$geo, "OR_EU"))
period_breaks <- as.Date(
  c("2018-01-01", "2020-01-01", "2022-01-01", "2024-01-01")
)
sel$ref <- cut(
  sel$day,
  breaks = period_breaks,
  labels = c("2018-19", "2020-21", "2022-23")
)
# Drop the middle period, then the factor level that is now unused
sel <- corpus_subset(sel, ref != "2020-21")
sel$ref <- droplevels(sel$ref)
# NOTE(review): the counts recorded below were produced while `sel` was still
# overwritten with qd_reg_tr; re-run to confirm them.
table(sel$ref)
FALSE
FALSE 2018-19 2022-23
FALSE 640 366
# Build a dfm with one row per period
pres_dfm <- tokens(sel, remove_punct = TRUE) %>%
  tokens_remove(stopw) %>%
  tokens_remove(
    c("AB", "den", "nın", "nin", "ye", "2022", "2021", "3", "e")
  ) %>%
  tokens_group(groups = ref) %>%
  dfm(tolower = FALSE) %>%
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Keyness with 2018-19 as target period. The argument is `target`; the
# original `sel = "2018-19"` is not a parameter of textstat_keyness().
result_keyness <- textstat_keyness(pres_dfm, target = "2018-19")

# Plot the terms with the highest keyness for target and reference periods
textplot_keyness(
  result_keyness,
  min_count = 10, n = 20, show_reference = TRUE
)
td <- tidy(sel)